import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('C://Users//Lenovo//Downloads//SampleSuperstore.csv//SampleSuperstore.csv')
df.head()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 |
df.tail()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9989 | Second Class | Consumer | United States | Miami | Florida | 33180 | South | Furniture | Furnishings | 25.248 | 3 | 0.2 | 4.1028 |
| 9990 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Furniture | Furnishings | 91.960 | 2 | 0.0 | 15.6332 |
| 9991 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Technology | Phones | 258.576 | 2 | 0.2 | 19.3932 |
| 9992 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Office Supplies | Paper | 29.600 | 4 | 0.0 | 13.3200 |
| 9993 | Second Class | Consumer | United States | Westminster | California | 92683 | West | Office Supplies | Appliances | 243.160 | 2 | 0.0 | 72.9480 |
df.shape
(9994, 13)
df.dtypes
Ship Mode object Segment object Country object City object State object Postal Code int64 Region object Category object Sub-Category object Sales float64 Quantity int64 Discount float64 Profit float64 dtype: object
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ship Mode 9994 non-null object 1 Segment 9994 non-null object 2 Country 9994 non-null object 3 City 9994 non-null object 4 State 9994 non-null object 5 Postal Code 9994 non-null int64 6 Region 9994 non-null object 7 Category 9994 non-null object 8 Sub-Category 9994 non-null object 9 Sales 9994 non-null float64 10 Quantity 9994 non-null int64 11 Discount 9994 non-null float64 12 Profit 9994 non-null float64 dtypes: float64(3), int64(2), object(8) memory usage: 1015.1+ KB
df.isnull().sum()
Ship Mode 0 Segment 0 Country 0 City 0 State 0 Postal Code 0 Region 0 Category 0 Sub-Category 0 Sales 0 Quantity 0 Discount 0 Profit 0 dtype: int64
df.duplicated().sum()
17
df = df.drop_duplicates()
# df.var()
df.cov()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1545644723.py:1: FutureWarning: The default value of numeric_only in DataFrame.cov is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df.cov()
| Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|
| Postal Code | 1.027732e+09 | -469412.768497 | 935.818272 | 391.989291 | -224679.858643 |
| Sales | -4.694128e+05 | 389028.396022 | 278.765576 | -3.645637 | 70057.067126 |
| Quantity | 9.358183e+02 | 278.765576 | 4.958001 | 0.003990 | 34.565743 |
| Discount | 3.919893e+02 | -3.645637 | 0.003990 | 0.042624 | -10.632751 |
| Profit | -2.246799e+05 | 70057.067126 | 34.565743 | -10.632751 | 54970.478824 |
df.corr()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1134722465.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df.corr()
| Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|
| Postal Code | 1.000000 | -0.023476 | 0.013110 | 0.059225 | -0.029892 |
| Sales | -0.023476 | 1.000000 | 0.200722 | -0.028311 | 0.479067 |
| Quantity | 0.013110 | 0.200722 | 1.000000 | 0.008678 | 0.066211 |
| Discount | 0.059225 | -0.028311 | 0.008678 | 1.000000 | -0.219662 |
| Profit | -0.029892 | 0.479067 | 0.066211 | -0.219662 | 1.000000 |
df.describe()
| Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|
| count | 9977.000000 | 9977.000000 | 9977.000000 | 9977.000000 | 9977.00000 |
| mean | 55154.964117 | 230.148902 | 3.790719 | 0.156278 | 28.69013 |
| std | 32058.266816 | 623.721409 | 2.226657 | 0.206455 | 234.45784 |
| min | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.97800 |
| 25% | 23223.000000 | 17.300000 | 2.000000 | 0.000000 | 1.72620 |
| 50% | 55901.000000 | 54.816000 | 3.000000 | 0.200000 | 8.67100 |
| 75% | 90008.000000 | 209.970000 | 5.000000 | 0.200000 | 29.37200 |
| max | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.97600 |
df['Ship Mode'].value_counts()
Standard Class 5955 Second Class 1943 First Class 1537 Same Day 542 Name: Ship Mode, dtype: int64
df['Segment'].value_counts()
Consumer 5183 Corporate 3015 Home Office 1779 Name: Segment, dtype: int64
df['Category'].value_counts()
Office Supplies 6012 Furniture 2118 Technology 1847 Name: Category, dtype: int64
df['Sub-Category'].value_counts()
Binders 1522 Paper 1359 Furnishings 956 Phones 889 Storage 846 Art 795 Accessories 775 Chairs 615 Appliances 466 Labels 363 Tables 319 Envelopes 254 Bookcases 228 Fasteners 217 Supplies 190 Machines 115 Copiers 68 Name: Sub-Category, dtype: int64
df['Sales'].plot(kind='box')
plt.show()
df['Quantity'].value_counts()
3 2400 2 2399 5 1230 4 1186 1 899 7 606 6 572 9 258 8 257 10 57 11 34 14 29 13 27 12 23 Name: Quantity, dtype: int64
df['Discount'].plot(kind='kde')
plt.show()
df['Profit'].plot(kind='kde')
plt.show()
df['Country'].unique()
array(['United States'], dtype=object)
df['City'].unique()
array(['Henderson', 'Los Angeles', 'Fort Lauderdale', 'Concord',
'Seattle', 'Fort Worth', 'Madison', 'West Jordan', 'San Francisco',
'Fremont', 'Philadelphia', 'Orem', 'Houston', 'Richardson',
'Naperville', 'Melbourne', 'Eagan', 'Westland', 'Dover',
'New Albany', 'New York City', 'Troy', 'Chicago', 'Gilbert',
'Springfield', 'Jackson', 'Memphis', 'Decatur', 'Durham',
'Columbia', 'Rochester', 'Minneapolis', 'Portland', 'Saint Paul',
'Aurora', 'Charlotte', 'Orland Park', 'Urbandale', 'Columbus',
'Bristol', 'Wilmington', 'Bloomington', 'Phoenix', 'Roseville',
'Independence', 'Pasadena', 'Newark', 'Franklin', 'Scottsdale',
'San Jose', 'Edmond', 'Carlsbad', 'San Antonio', 'Monroe',
'Fairfield', 'Grand Prairie', 'Redlands', 'Hamilton', 'Westfield',
'Akron', 'Denver', 'Dallas', 'Whittier', 'Saginaw', 'Medina',
'Dublin', 'Detroit', 'Tampa', 'Santa Clara', 'Lakeville',
'San Diego', 'Brentwood', 'Chapel Hill', 'Morristown',
'Cincinnati', 'Inglewood', 'Tamarac', 'Colorado Springs',
'Belleville', 'Taylor', 'Lakewood', 'Arlington', 'Arvada',
'Hackensack', 'Saint Petersburg', 'Long Beach', 'Hesperia',
'Murfreesboro', 'Layton', 'Austin', 'Lowell', 'Manchester',
'Harlingen', 'Tucson', 'Quincy', 'Pembroke Pines', 'Des Moines',
'Peoria', 'Las Vegas', 'Warwick', 'Miami', 'Huntington Beach',
'Richmond', 'Louisville', 'Lawrence', 'Canton', 'New Rochelle',
'Gastonia', 'Jacksonville', 'Auburn', 'Norman', 'Park Ridge',
'Amarillo', 'Lindenhurst', 'Huntsville', 'Fayetteville',
'Costa Mesa', 'Parker', 'Atlanta', 'Gladstone', 'Great Falls',
'Lakeland', 'Montgomery', 'Mesa', 'Green Bay', 'Anaheim',
'Marysville', 'Salem', 'Laredo', 'Grove City', 'Dearborn',
'Warner Robins', 'Vallejo', 'Mission Viejo', 'Rochester Hills',
'Plainfield', 'Sierra Vista', 'Vancouver', 'Cleveland', 'Tyler',
'Burlington', 'Waynesboro', 'Chester', 'Cary', 'Palm Coast',
'Mount Vernon', 'Hialeah', 'Oceanside', 'Evanston', 'Trenton',
'Cottage Grove', 'Bossier City', 'Lancaster', 'Asheville',
'Lake Elsinore', 'Omaha', 'Edmonds', 'Santa Ana', 'Milwaukee',
'Florence', 'Lorain', 'Linden', 'Salinas', 'New Brunswick',
'Garland', 'Norwich', 'Alexandria', 'Toledo', 'Farmington',
'Riverside', 'Torrance', 'Round Rock', 'Boca Raton',
'Virginia Beach', 'Murrieta', 'Olympia', 'Washington',
'Jefferson City', 'Saint Peters', 'Rockford', 'Brownsville',
'Yonkers', 'Oakland', 'Clinton', 'Encinitas', 'Roswell',
'Jonesboro', 'Antioch', 'Homestead', 'La Porte', 'Lansing',
'Cuyahoga Falls', 'Reno', 'Harrisonburg', 'Escondido', 'Royal Oak',
'Rockville', 'Coral Springs', 'Buffalo', 'Boynton Beach',
'Gulfport', 'Fresno', 'Greenville', 'Macon', 'Cedar Rapids',
'Providence', 'Pueblo', 'Deltona', 'Murray', 'Middletown',
'Freeport', 'Pico Rivera', 'Provo', 'Pleasant Grove', 'Smyrna',
'Parma', 'Mobile', 'New Bedford', 'Irving', 'Vineland', 'Glendale',
'Niagara Falls', 'Thomasville', 'Westminster', 'Coppell', 'Pomona',
'North Las Vegas', 'Allentown', 'Tempe', 'Laguna Niguel',
'Bridgeton', 'Everett', 'Watertown', 'Appleton', 'Bellevue',
'Allen', 'El Paso', 'Grapevine', 'Carrollton', 'Kent', 'Lafayette',
'Tigard', 'Skokie', 'Plano', 'Suffolk', 'Indianapolis', 'Bayonne',
'Greensboro', 'Baltimore', 'Kenosha', 'Olathe', 'Tulsa', 'Redmond',
'Raleigh', 'Muskogee', 'Meriden', 'Bowling Green', 'South Bend',
'Spokane', 'Keller', 'Port Orange', 'Medford', 'Charlottesville',
'Missoula', 'Apopka', 'Reading', 'Broomfield', 'Paterson',
'Oklahoma City', 'Chesapeake', 'Lubbock', 'Johnson City',
'San Bernardino', 'Leominster', 'Bozeman', 'Perth Amboy',
'Ontario', 'Rancho Cucamonga', 'Moorhead', 'Mesquite', 'Stockton',
'Ormond Beach', 'Sunnyvale', 'York', 'College Station',
'Saint Louis', 'Manteca', 'San Angelo', 'Salt Lake City',
'Knoxville', 'Little Rock', 'Lincoln Park', 'Marion', 'Littleton',
'Bangor', 'Southaven', 'New Castle', 'Midland', 'Sioux Falls',
'Fort Collins', 'Clarksville', 'Sacramento', 'Thousand Oaks',
'Malden', 'Holyoke', 'Albuquerque', 'Sparks', 'Coachella',
'Elmhurst', 'Passaic', 'North Charleston', 'Newport News',
'Jamestown', 'Mishawaka', 'La Quinta', 'Tallahassee', 'Nashville',
'Bellingham', 'Woodstock', 'Haltom City', 'Wheeling',
'Summerville', 'Hot Springs', 'Englewood', 'Las Cruces', 'Hoover',
'Frisco', 'Vacaville', 'Waukesha', 'Bakersfield', 'Pompano Beach',
'Corpus Christi', 'Redondo Beach', 'Orlando', 'Orange',
'Lake Charles', 'Highland Park', 'Hempstead', 'Noblesville',
'Apple Valley', 'Mount Pleasant', 'Sterling Heights', 'Eau Claire',
'Pharr', 'Billings', 'Gresham', 'Chattanooga', 'Meridian',
'Bolingbrook', 'Maple Grove', 'Woodland', 'Missouri City',
'Pearland', 'San Mateo', 'Grand Rapids', 'Visalia',
'Overland Park', 'Temecula', 'Yucaipa', 'Revere', 'Conroe',
'Tinley Park', 'Dubuque', 'Dearborn Heights', 'Santa Fe',
'Hickory', 'Carol Stream', 'Saint Cloud', 'North Miami',
'Plantation', 'Port Saint Lucie', 'Rock Hill', 'Odessa',
'West Allis', 'Chula Vista', 'Manhattan', 'Altoona', 'Thornton',
'Champaign', 'Texarkana', 'Edinburg', 'Baytown', 'Greenwood',
'Woonsocket', 'Superior', 'Bedford', 'Covington', 'Broken Arrow',
'Miramar', 'Hollywood', 'Deer Park', 'Wichita', 'Mcallen',
'Iowa City', 'Boise', 'Cranston', 'Port Arthur', 'Citrus Heights',
'The Colony', 'Daytona Beach', 'Bullhead City', 'Portage', 'Fargo',
'Elkhart', 'San Gabriel', 'Margate', 'Sandy Springs', 'Mentor',
'Lawton', 'Hampton', 'Rome', 'La Crosse', 'Lewiston',
'Hattiesburg', 'Danville', 'Logan', 'Waterbury', 'Athens',
'Avondale', 'Marietta', 'Yuma', 'Wausau', 'Pasco', 'Oak Park',
'Pensacola', 'League City', 'Gaithersburg', 'Lehi', 'Tuscaloosa',
'Moreno Valley', 'Georgetown', 'Loveland', 'Chandler', 'Helena',
'Kirkwood', 'Waco', 'Frankfort', 'Bethlehem', 'Grand Island',
'Woodbury', 'Rogers', 'Clovis', 'Jupiter', 'Santa Barbara',
'Cedar Hill', 'Norfolk', 'Draper', 'Ann Arbor', 'La Mesa',
'Pocatello', 'Holland', 'Milford', 'Buffalo Grove', 'Lake Forest',
'Redding', 'Chico', 'Utica', 'Conway', 'Cheyenne', 'Owensboro',
'Caldwell', 'Kenner', 'Nashua', 'Bartlett', 'Redwood City',
'Lebanon', 'Santa Maria', 'Des Plaines', 'Longview',
'Hendersonville', 'Waterloo', 'Cambridge', 'Palatine', 'Beverly',
'Eugene', 'Oxnard', 'Renton', 'Glenview', 'Delray Beach',
'Commerce City', 'Texas City', 'Wilson', 'Rio Rancho', 'Goldsboro',
'Montebello', 'El Cajon', 'Beaumont', 'West Palm Beach', 'Abilene',
'Normal', 'Saint Charles', 'Camarillo', 'Hillsboro', 'Burbank',
'Modesto', 'Garden City', 'Atlantic City', 'Longmont', 'Davis',
'Morgan Hill', 'Clifton', 'Sheboygan', 'East Point', 'Rapid City',
'Andover', 'Kissimmee', 'Shelton', 'Danbury', 'Sanford',
'San Marcos', 'Greeley', 'Mansfield', 'Elyria', 'Twin Falls',
'Coral Gables', 'Romeoville', 'Marlborough', 'Laurel', 'Bryan',
'Pine Bluff', 'Aberdeen', 'Hagerstown', 'East Orange',
'Arlington Heights', 'Oswego', 'Coon Rapids', 'San Clemente',
'San Luis Obispo', 'Springdale', 'Lodi', 'Mason'], dtype=object)
df['State'].unique()
array(['Kentucky', 'California', 'Florida', 'North Carolina',
'Washington', 'Texas', 'Wisconsin', 'Utah', 'Nebraska',
'Pennsylvania', 'Illinois', 'Minnesota', 'Michigan', 'Delaware',
'Indiana', 'New York', 'Arizona', 'Virginia', 'Tennessee',
'Alabama', 'South Carolina', 'Oregon', 'Colorado', 'Iowa', 'Ohio',
'Missouri', 'Oklahoma', 'New Mexico', 'Louisiana', 'Connecticut',
'New Jersey', 'Massachusetts', 'Georgia', 'Nevada', 'Rhode Island',
'Mississippi', 'Arkansas', 'Montana', 'New Hampshire', 'Maryland',
'District of Columbia', 'Kansas', 'Vermont', 'Maine',
'South Dakota', 'Idaho', 'North Dakota', 'Wyoming',
'West Virginia'], dtype=object)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Convert 'Ship Mode' column to categorical type
df['Ship Mode'] = df['Ship Mode'].astype('category')
# Plot count using seaborn
sns.countplot(data=df, x='Ship Mode', palette='Set2')
plt.show()
# Convert 'Segment' column to categorical type
df['Segment'] = df['Segment'].astype('category')
# Plot count using seaborn
sns.countplot(data=df, x='Segment', palette='Set2')
plt.show()
# sns.barplot(df['State'], df['Sales'], palette='Set2')
# sns.barplot(df['City'], df['Sales'], palette='Set2')
plt.figure(figsize=(30,10))
df['State'].value_counts().plot.bar()
plt.show()
# Convert 'Region' column to categorical type
df['Region'] = df['Region'].astype('category')
# Plot count using seaborn
sns.countplot(data=df, x='Region', palette='Set2')
plt.show()
# Convert 'Category' column to categorical type
df['Category'] = df['Category'].astype('category')
# Plot count using seaborn
sns.countplot(data=df, x='Category', palette='Set2')
plt.show()
plt.figure(figsize=(20,10))
sns.countplot(x=df['Sub-Category'], palette='Set2',)
plt.show()
df['Sales'].value_counts()
12.960 55
19.440 37
15.552 37
10.368 35
25.920 34
..
4.240 1
319.960 1
646.740 1
81.940 1
243.160 1
Name: Sales, Length: 5825, dtype: int64
df['Sales'].value_counts().sum()
9977
plt.figure(figsize=(30,10))
df['Sales'].plot(kind='line')
plt.show()
df['Quantity'].value_counts()
3 2400 2 2399 5 1230 4 1186 1 899 7 606 6 572 9 258 8 257 10 57 11 34 14 29 13 27 12 23 Name: Quantity, dtype: int64
sns.countplot(x=df['Quantity'], palette='Set2')
plt.show()
sns.countplot(x=df['Discount'], palette='Set2')
plt.show()
df['Profit'].value_counts()
0.0000 65
6.2208 42
9.3312 36
3.6288 31
5.4432 30
..
83.2508 1
16.1096 1
7.1988 1
1.6510 1
72.9480 1
Name: Profit, Length: 7287, dtype: int64
plt.figure(figsize=(30,10))
df['Profit'].plot(kind='line')
plt.show()
plt.figure(figsize=(10,5))
sns.heatmap(df.corr(), annot=True)
plt.show()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\2867204846.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df.corr(), annot=True)
sns.pairplot(df, hue='Ship Mode')
plt.show()
ship_mode = df.groupby('Ship Mode')[['Profit', 'Sales']]
ship_mode.head()
| Profit | Sales | |
|---|---|---|
| 0 | 41.9136 | 261.9600 |
| 1 | 219.5820 | 731.9400 |
| 2 | 6.8714 | 14.6200 |
| 3 | -383.0310 | 957.5775 |
| 4 | 2.5164 | 22.3680 |
| 5 | 14.1694 | 48.8600 |
| 6 | 1.9656 | 7.2800 |
| 7 | 90.7152 | 907.1520 |
| 17 | 9.9900 | 55.5000 |
| 18 | 2.4824 | 8.5600 |
| 35 | 123.4737 | 1097.5440 |
| 36 | -147.9630 | 190.9200 |
| 44 | 19.7714 | 45.9800 |
| 45 | 8.2062 | 17.4600 |
| 55 | 52.1400 | 208.5600 |
| 366 | 10.4400 | 23.2000 |
| 367 | 0.1472 | 7.3600 |
| 368 | 29.3412 | 104.7900 |
| 369 | 271.4192 | 1043.9200 |
| 657 | -50.0980 | 701.3720 |
ship_mode.cov()
| Profit | Sales | ||
|---|---|---|---|
| Ship Mode | |||
| First Class | Profit | 66500.973629 | 119609.940039 |
| Sales | 119609.940039 | 397622.361191 | |
| Same Day | Profit | 51482.952083 | -6040.748196 |
| Sales | -6040.748196 | 308811.805446 | |
| Second Class | Profit | 23377.756306 | 38957.194668 |
| Sales | 38957.194668 | 312913.595466 | |
| Standard Class | Profit | 62640.769596 | 74363.646237 |
| Sales | 74363.646237 | 419100.307380 |
ship_mode.corr()
| Profit | Sales | ||
|---|---|---|---|
| Ship Mode | |||
| First Class | Profit | 1.000000 | 0.735560 |
| Sales | 0.735560 | 1.000000 | |
| Same Day | Profit | 1.000000 | -0.047908 |
| Sales | -0.047908 | 1.000000 | |
| Second Class | Profit | 1.000000 | 0.455485 |
| Sales | 0.455485 | 1.000000 | |
| Standard Class | Profit | 1.000000 | 0.458958 |
| Sales | 0.458958 | 1.000000 |
ship_mode.sum()
| Profit | Sales | |
|---|---|---|
| Ship Mode | ||
| First Class | 48953.6561 | 3.513805e+05 |
| Same Day | 15871.8869 | 1.283217e+05 |
| Second Class | 57446.6516 | 4.591770e+05 |
| Standard Class | 163969.2280 | 1.357316e+06 |
# df.plot(kind='line', x='Sales', y='Profit')
sns.lineplot(data=df, x='Sales', y='Profit')
plt.show()
sns.pairplot(df, hue='State', palette='Set2')
plt.show()
# sns.pairplot(df, hue='Sales', palette='Set2')
# sns.pairplot(df, hue='Profit', palette='Set2')
sns.pairplot(df, hue='Discount', palette='Set2')
plt.show()
state = df.groupby(['State'])[['Sales', 'Discount', 'Profit']]
state.head()
| Sales | Discount | Profit | |
|---|---|---|---|
| 0 | 261.9600 | 0.00 | 41.9136 |
| 1 | 731.9400 | 0.00 | 219.5820 |
| 2 | 14.6200 | 0.00 | 6.8714 |
| 3 | 957.5775 | 0.45 | -383.0310 |
| 4 | 22.3680 | 0.20 | 2.5164 |
| ... | ... | ... | ... |
| 7896 | 447.8400 | 0.00 | 219.4416 |
| 8798 | 1294.7500 | 0.00 | 336.6350 |
| 9146 | 99.9800 | 0.00 | 42.9914 |
| 9147 | 8.0400 | 0.00 | 2.7336 |
| 9486 | 673.3440 | 0.30 | -76.9536 |
240 rows × 3 columns
state.mean()
| Sales | Discount | Profit | |
|---|---|---|---|
| State | |||
| Alabama | 319.846557 | 0.000000 | 94.865989 |
| Arizona | 157.508933 | 0.303571 | -15.303235 |
| Arkansas | 194.635500 | 0.000000 | 66.811452 |
| California | 229.246629 | 0.072946 | 38.241878 |
| Colorado | 176.418231 | 0.316484 | -35.867351 |
| Connecticut | 163.223866 | 0.007317 | 42.823071 |
| Delaware | 285.948635 | 0.006250 | 103.930988 |
| District of Columbia | 286.502000 | 0.000000 | 105.958930 |
| Florida | 233.612815 | 0.299347 | -8.875461 |
| Georgia | 266.825217 | 0.000000 | 88.315453 |
| Idaho | 208.689810 | 0.085714 | 39.367767 |
| Illinois | 163.263823 | 0.389206 | -25.665275 |
| Indiana | 359.431946 | 0.000000 | 123.375411 |
| Iowa | 152.658667 | 0.000000 | 39.460397 |
| Kansas | 121.429583 | 0.000000 | 34.851813 |
| Kentucky | 263.250000 | 0.000000 | 80.573357 |
| Louisiana | 219.453095 | 0.000000 | 52.288150 |
| Maine | 158.816250 | 0.000000 | 56.810775 |
| Maryland | 225.766886 | 0.005714 | 66.963608 |
| Massachusetts | 212.106919 | 0.015556 | 50.262975 |
| Michigan | 298.738756 | 0.007087 | 96.173584 |
| Minnesota | 335.541011 | 0.000000 | 121.608847 |
| Mississippi | 203.232830 | 0.000000 | 59.867475 |
| Missouri | 336.441667 | 0.000000 | 97.518341 |
| Montana | 372.623467 | 0.066667 | 122.221900 |
| Nebraska | 196.445526 | 0.000000 | 53.607742 |
| Nevada | 428.951333 | 0.061538 | 85.045279 |
| New Hampshire | 270.093481 | 0.011111 | 63.203807 |
| New Jersey | 275.110092 | 0.004615 | 75.176260 |
| New Mexico | 129.284378 | 0.059459 | 31.273408 |
| New York | 275.800489 | 0.055368 | 65.674767 |
| North Carolina | 223.305880 | 0.283534 | -30.083985 |
| North Dakota | 131.415714 | 0.000000 | 32.878529 |
| Ohio | 166.617017 | 0.325000 | -36.237859 |
| Oklahoma | 298.233182 | 0.000000 | 73.544788 |
| Oregon | 141.632374 | 0.289431 | -9.708124 |
| Pennsylvania | 198.799253 | 0.328840 | -26.562122 |
| Rhode Island | 404.070643 | 0.021429 | 130.100523 |
| South Carolina | 201.945476 | 0.000000 | 42.120395 |
| South Dakota | 109.630000 | 0.000000 | 32.902358 |
| Tennessee | 167.551219 | 0.291257 | -29.189583 |
| Texas | 173.066675 | 0.370539 | -26.196321 |
| Utah | 211.699170 | 0.060377 | 48.047802 |
| Vermont | 811.760909 | 0.000000 | 204.088936 |
| Virginia | 315.342500 | 0.000000 | 83.026564 |
| Washington | 276.017550 | 0.064542 | 66.470593 |
| West Virginia | 302.456000 | 0.075000 | 46.480400 |
| Wisconsin | 291.951000 | 0.000000 | 76.380004 |
| Wyoming | 1603.136000 | 0.200000 | 100.196000 |
state.std()
| Sales | Discount | Profit | |
|---|---|---|---|
| State | |||
| Alabama | 545.761807 | 0.000000 | 210.901876 |
| Arizona | 249.710692 | 0.197031 | 109.266267 |
| Arkansas | 316.405669 | 0.000000 | 123.249880 |
| California | 491.510940 | 0.095275 | 97.803399 |
| Colorado | 324.415072 | 0.208792 | 276.424024 |
| Connecticut | 251.732268 | 0.046562 | 66.138081 |
| Delaware | 1112.818485 | 0.043073 | 518.953513 |
| District of Columbia | 547.419707 | 0.000000 | 213.461353 |
| Florida | 1205.490630 | 0.192784 | 126.324165 |
| Georgia | 637.796228 | 0.000000 | 283.026694 |
| Idaho | 282.061093 | 0.101419 | 63.027976 |
| Illinois | 317.364546 | 0.244669 | 175.872237 |
| Indiana | 1481.538652 | 0.000000 | 693.643105 |
| Iowa | 326.637475 | 0.000000 | 73.763444 |
| Kansas | 124.606356 | 0.000000 | 42.619311 |
| Kentucky | 472.818538 | 0.000000 | 171.126404 |
| Louisiana | 365.264153 | 0.000000 | 83.357777 |
| Maine | 146.676370 | 0.000000 | 53.816911 |
| Maryland | 396.914182 | 0.041204 | 148.214655 |
| Massachusetts | 336.924698 | 0.066766 | 94.386407 |
| Michigan | 827.737778 | 0.025711 | 375.279834 |
| Minnesota | 1070.678744 | 0.000000 | 505.784116 |
| Mississippi | 365.834352 | 0.000000 | 80.793505 |
| Missouri | 830.866752 | 0.000000 | 308.032711 |
| Montana | 784.412796 | 0.097590 | 350.364144 |
| Nebraska | 459.444553 | 0.000000 | 133.163393 |
| Nevada | 881.267531 | 0.093514 | 265.901543 |
| New Hampshire | 475.181739 | 0.057735 | 119.893107 |
| New Jersey | 890.520072 | 0.037066 | 236.387912 |
| New Mexico | 190.093868 | 0.092675 | 39.290011 |
| New York | 691.990620 | 0.096589 | 232.640276 |
| North Carolina | 648.754454 | 0.178959 | 297.900997 |
| North Dakota | 256.602857 | 0.000000 | 58.123254 |
| Ohio | 353.599895 | 0.188295 | 341.709917 |
| Oklahoma | 431.874240 | 0.000000 | 109.026519 |
| Oregon | 215.583223 | 0.185902 | 74.744861 |
| Pennsylvania | 531.457950 | 0.189748 | 136.404709 |
| Rhode Island | 853.179916 | 0.077961 | 384.816449 |
| South Carolina | 318.462333 | 0.000000 | 73.698963 |
| South Dakota | 148.257140 | 0.000000 | 43.803938 |
| Tennessee | 360.678518 | 0.184956 | 176.636392 |
| Texas | 424.831382 | 0.243489 | 189.207658 |
| Utah | 378.723382 | 0.092694 | 83.929399 |
| Vermont | 1309.310745 | 0.000000 | 301.653921 |
| Virginia | 792.001296 | 0.000000 | 235.266720 |
| Washington | 780.008540 | 0.093596 | 326.560751 |
| West Virginia | 313.508627 | 0.150000 | 125.265494 |
| Wisconsin | 461.590111 | 0.000000 | 124.946125 |
| Wyoming | NaN | NaN | NaN |
state.corr()
| Sales | Discount | Profit | ||
|---|---|---|---|---|
| State | ||||
| Alabama | Sales | 1.000000 | NaN | 0.896932 |
| Discount | NaN | NaN | NaN | |
| Profit | 0.896932 | NaN | 1.000000 | |
| Arizona | Sales | 1.000000 | -0.05654 | -0.242726 |
| Discount | -0.056540 | 1.00000 | -0.427620 | |
| ... | ... | ... | ... | ... |
| Wisconsin | Discount | NaN | NaN | NaN |
| Profit | 0.952947 | NaN | 1.000000 | |
| Wyoming | Sales | NaN | NaN | NaN |
| Discount | NaN | NaN | NaN | |
| Profit | NaN | NaN | NaN |
147 rows × 3 columns
plt.figure(figsize=(10,5))
sns.heatmap(state.corr().head(), annot=True)
plt.show()
sns.lineplot(data=df, x='Discount', y='Profit')
plt.show()
sns.barplot(data=df, x='Discount', y='Profit', palette='Set2')
plt.show()
sns.lineplot(data=df, x='Sales', y='Profit')
plt.show()
sns.lineplot(data=df, x='Sales', y='Discount')
plt.show()
city = df.groupby(['City'])[['Sales', 'Discount', 'Profit']].mean()
city = city.sort_values('Profit')
city.head()
| Sales | Discount | Profit | |
|---|---|---|---|
| City | |||
| Bethlehem | 337.926800 | 0.380000 | -200.619160 |
| Champaign | 151.960000 | 0.600000 | -182.352000 |
| Oswego | 107.326000 | 0.600000 | -178.709200 |
| Round Rock | 693.436114 | 0.274286 | -169.061614 |
| Lancaster | 215.031826 | 0.315217 | -157.371052 |
# For High Profit Analysis
city['Profit'].tail(50).plot(kind='bar', figsize=(15,5), color='green')
plt.title('City Wise Sales Discount & Profit Analysis')
plt.show()
# For Low Profit Analysis
city['Profit'].head(50).plot(kind='bar', figsize=(15,5), color='red')
plt.title('City Wise Sales Discount & Profit Analysis')
plt.show()
quantity = df.groupby(['Quantity'])[['Sales', 'Discount', 'Profit']]
quantity = df.groupby(['Quantity'])[['Sales', 'Discount', 'Profit']].mean()
quantity.head(10)
| Sales | Discount | Profit | |
|---|---|---|---|
| Quantity | |||
| 1 | 59.234632 | 0.152959 | 8.276396 |
| 2 | 120.377982 | 0.154844 | 16.027769 |
| 3 | 175.630896 | 0.153321 | 23.720306 |
| 4 | 272.779827 | 0.158373 | 37.227971 |
| 5 | 337.936339 | 0.157146 | 40.257394 |
| 6 | 362.101960 | 0.166556 | 18.051517 |
| 7 | 395.888393 | 0.161980 | 56.579163 |
| 8 | 458.210802 | 0.171595 | 42.244342 |
| 9 | 498.083683 | 0.147946 | 68.557716 |
| 10 | 422.046737 | 0.190702 | 35.862404 |
# 1_Sales 2_Discount 3_Profit
quantity.plot.pie(subplots=True, autopct='%1.1f%%', figsize=(25,25),
pctdistance=0.69, startangle=90, shadow=True, labels=quantity.index)
plt.title('Quantity Wise Sales Discount & Profit Analysis', fontsize=30)
plt.show()
category = df.groupby(['Category'])[['Sales', 'Discount', 'Profit']]
category = df.groupby(['Category'])[['Sales', 'Discount', 'Profit']].mean()
category
| Sales | Discount | Profit | |
|---|---|---|---|
| Category | |||
| Furniture | 350.002981 | 0.174027 | 8.697740 |
| Office Supplies | 119.550107 | 0.157385 | 20.353403 |
| Technology | 452.709276 | 0.132323 | 78.752002 |
category.plot.pie(subplots=True, figsize=(18,20), autopct='%1.1f%%', labels=category.index)
plt.title('Category Wise Sales Discount & Profit Analysis', fontsize=15)
plt.show()
sub_category = df.groupby(['Sub-Category'])[['Sales', 'Discount', 'Profit']]
sub_category = df.groupby(['Sub-Category'])[['Sales', 'Discount', 'Profit']].mean()
sub_category
| Sales | Discount | Profit | |
|---|---|---|---|
| Sub-Category | |||
| Accessories | 215.974604 | 0.078452 | 54.111788 |
| Appliances | 230.755710 | 0.166524 | 38.922758 |
| Art | 34.096896 | 0.074969 | 8.207059 |
| Binders | 133.645972 | 0.372011 | 19.860710 |
| Bookcases | 503.859633 | 0.211140 | -15.230509 |
| Chairs | 532.971969 | 0.170244 | 43.198582 |
| Copiers | 2198.941618 | 0.161765 | 817.909190 |
| Envelopes | 64.867724 | 0.080315 | 27.418019 |
| Fasteners | 13.936774 | 0.082028 | 4.375660 |
| Furnishings | 95.902745 | 0.138494 | 13.653476 |
| Labels | 34.283504 | 0.068871 | 15.224193 |
| Machines | 1645.553313 | 0.306087 | 29.432669 |
| Paper | 57.560075 | 0.074908 | 24.977365 |
| Phones | 371.211534 | 0.154556 | 50.073938 |
| Storage | 264.590553 | 0.074704 | 25.152277 |
| Supplies | 245.650200 | 0.076842 | -6.258418 |
| Tables | 648.794771 | 0.261285 | -55.565771 |
plt.figure(figsize=(25,25))
plt.pie(sub_category['Sales'], labels=sub_category.index, autopct='%1.1f%%')
plt.title('Sub_Category Wise Sales Profit & Discount Analysis', fontsize=30)
plt.legend()
plt.xticks(rotation=90)
plt.show()
plt.figure(figsize=(25,25))
plt.pie(sub_category['Discount'], labels=sub_category.index, autopct='%1.1f%%')
plt.title('Sub_Category Wise Discount Analysis', fontsize=30)
plt.legend()
plt.xticks(rotation=90)
plt.show()
sub_category.sort_values('Profit')[['Sales','Profit']].plot(kind='bar',
figsize=(20,10),
label=['Average Sales Price($)', 'Profit($)'])
plt.title('Sub_Category Wise Profit Analysis', fontsize=30)
plt.legend()
plt.show()
region = df.groupby(['Region'])[['Sales', 'Discount', 'Profit']]
region = df.groupby(['Region'])[['Sales', 'Discount', 'Profit']].mean()
region
| Sales | Discount | Profit | |
|---|---|---|---|
| Region | |||
| Central | 215.947759 | 0.240250 | 17.100421 |
| East | 238.465798 | 0.145343 | 32.163905 |
| South | 241.803645 | 0.147253 | 28.857673 |
| West | 227.139254 | 0.109615 | 33.927281 |
region.plot.pie(subplots=True, figsize=(25,20), autopct='%1.1f%%', labels=region.index)
plt.title('Region Wise Profit Analysis', fontsize=30)
plt.legend()
plt.show()
df.hist(bins=50 ,figsize=(20,15))
plt.show()
sns.pairplot(df, hue='Sub-Category', palette='Set2')
plt.show()
def state_data_viewer(states):
product_data = df.groupby(['State'])
for state in states:
data = product_data.get_group(state).groupby(['Category'])
fig, ax = plt.subplots(1,3, figsize=(28,5))
fig.suptitle(state, fontsize=14)
ax_index = 0
for cat in ['Furniture', 'Office Supplies', 'Technology']:
cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
sns.barplot(x=cat_data.Profit, y=cat_data.index, ax=ax[ax_index], palette='Set2')
ax[ax_index].set_ylabel(cat)
ax_index +=1
fig.show()
states = ['California', 'Washington', 'Mississippi', 'Arizona', 'Texas']
state_data_viewer(states)
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. fig.show() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. fig.show() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. fig.show() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. fig.show() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:10: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. cat_data = data.get_group(cat).groupby(['Sub-Category']).sum() C:\Users\Lenovo\AppData\Local\Temp\ipykernel_6168\1894147851.py:14: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. fig.show()
x = df.iloc[:, [9,10,11,12]].values
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, init='k-means++',
max_iter = 300, n_init = 10,
random_state=0).fit(x)
wcss.append(kmeans.inertia_)
sns.set_style('whitegrid')
sns.FacetGrid(df, hue='Sub-Category', height=6).map(plt.scatter, 'Sales', 'Quantity')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1],
s=100, c='yellow', label='Centroids')
plt.legend()
plt.show()
sns.set_style('whitegrid')
sns.FacetGrid(df, hue='Sub-Category', height=6).map(plt.scatter, 'Sales', 'Profit')
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1],
s=100, c='yellow', label='Centroids')
plt.legend()
plt.show()
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['Sales'] , df['Profit'])
ax.set_xlabel('Sales')
ax.set_ylabel('Profit')
ax.set_title('Sales & Profit')
plt.show()